Data Overview

Dataset Name: Monitoring of Federal Criminal Sentences

Year: 2018-2019

Source: United States Sentencing Commission Link to Data

Summary and Previous Work: The data files encompass all records received by the United States Sentencing Commission with sentencing dates falling within the period from October 1, 2018, to September 30, 2019. This data set has been cited a total of 11 times; most of the citing work focuses on analyzing racial and gender disparities using techniques such as random coefficient models, logistic regression, and ordinary least squares regression. We hope to expand on this knowledge by focusing on the sentence lengths of drug-related crimes.

Question: What factors affect the length of incarceration, in months, for all drug trafficking offenses?

Variables: Table with Relevant Variables and Definitions

Reading the data

#library(dplyr)
#dat2019 = read.csv('C:/Users/cakus/Downloads/37990-0001-Data.tsv', sep = '\t', header = T)

Saving the data to .csv

#library("readr")
library(tidyverse)
library(dplyr)
library(ggplot2)
library(plotly)
# NOTE(review): the commented-out write.csv used "Sentencing-Data_Imported.csv"
# (hyphen) while read.csv below expects "Sentencing_Data_Imported.csv"
# (underscore) -- confirm the file name on disk matches the read path.
#write.csv(dat2019, "Sentencing-Data_Imported.csv")
dat2019 = read.csv("Sentencing_Data_Imported.csv")

Filtering Data

# Keep only the columns relevant to the drug-sentencing analysis.
relevant_vars <- c("SENTTCAP", "OFFGUIDE", "AGE", "WGT1", "BASEHI", "MONSEX",
                   "COMBDRG2", "CITIZEN", "NEWEDUC", "WEAPON", "ACCCAT",
                   "DISTRICT", "AMENDYR", "IS924C", "MONACCEP", "NEWCNVTN",
                   "NUMDEPEN", "DRUGMIN", "XCRHISSR", "MONRACE", "HISPORIG")
overall2019 = dat2019 %>%
  dplyr::select(dplyr::all_of(relevant_vars))
#colnames(dat2019)

Subset (drug types 1-6 and 8; COMBDRG2 code 7 is excluded by the filter below)

# Restrict to the analysis sample and drop rows with any missing values.
# XCRHISSR == 6 and OFFGUIDE == 10 select a single criminal-history category
# and a single offense guideline (drug trafficking per the research question --
# confirm codes against the USSC codebook); WGT1 caps drug weight at 10,000.
# Fix: the seven-way `==` OR chain over COMBDRG2 is replaced by an
# equivalent, clearer `%in%` test (note code 7 remains excluded).
filter2019 = filter(overall2019,
                    XCRHISSR == 6,
                    COMBDRG2 %in% c(1, 2, 3, 4, 5, 6, 8),
                    WGT1 <= 10000,
                    OFFGUIDE == 10) %>% 
  na.omit()

Exploratory Data Analysis

Boxplot of SENTTCAP and NEWCNVTN

# Recode conviction type as a labeled factor (labels applied in level order;
# presumably 0/1 = plea/trial -- confirm in codebook).
filter2019$NEWCNVTN <- factor(filter2019$NEWCNVTN, labels = c("Plea Deal", "Trial"))

# Boxplot of sentence length by plea/trial status.
ggplot(filter2019, aes(x = NEWCNVTN, y = SENTTCAP, fill = NEWCNVTN)) +
  geom_boxplot(outlier.size = 2) +
  labs(
    title = "Boxplot of Sentence Length and Plea/Trial",
    x = "Plea/Trial",
    y = "Sentence Length"
  ) +
  theme(plot.title = element_text(hjust = 0.5))

#FIVE NUMBER SUMMARY with mean
# Tukey five-number summary (fivenum hinges, not quantile()) plus the mean,
# per plea/trial group.
filter2019 %>%
  group_by(NEWCNVTN) %>%
  summarise(
    min    = fivenum(SENTTCAP)[[1]],
    Q1     = fivenum(SENTTCAP)[[2]],
    median = fivenum(SENTTCAP)[[3]],
    Q3     = fivenum(SENTTCAP)[[4]],
    max    = fivenum(SENTTCAP)[[5]],
    mean   = mean(SENTTCAP)
  )

Scatterplot of SENTTCAP vs WGT1, grouped by COMBDRG2

# Scatter of sentence length against drug weight, colored by drug type code.
# NOTE(review): COMBDRG2 is still numeric at this point (it is only factored
# later in the file), so the color legend renders from a continuous scale.
ggplot(filter2019, aes(x = WGT1, y = SENTTCAP, color = COMBDRG2)) +
  geom_point() +
  labs(
    title = "Sentence Length vs Weight",
    x = "Weight of Drug",
    y = "Sentence Length"
  ) +
  theme(plot.title = element_text(hjust = 0.5)) +
  guides(col = guide_legend("Type of Drug"))

Scatterplot SENTTCAP vs WGT1, Grouped by NEWCNVTN

# Same weight scatter, but colored by plea/trial status instead of drug type.
ggplot(filter2019, aes(x = WGT1, y = SENTTCAP, color = NEWCNVTN)) +
  geom_point() +
  labs(
    title = "Sentence Length vs Weight, Grouped by Plea/Trial",
    x = "Weight of Drug",
    y = "Sentence Length"
  ) +
  theme(plot.title = element_text(hjust = 0.5)) +
  guides(col = guide_legend("Plea/Trial"))

Boxplot of SENTTCAP and NEWEDUC

# Recode education level as an ordered set of labels (applied in level order).
filter2019$NEWEDUC <- factor(
  filter2019$NEWEDUC,
  labels = c("Less than HS Grad", "HS Grad", "Some College", "College Grad")
)

# Boxplot of sentence length by education level.
ggplot(filter2019, aes(x = NEWEDUC, y = SENTTCAP, fill = NEWEDUC)) +
  geom_boxplot(outlier.size = 2) +
  labs(
    title = "Boxplot of Sentence Length and Education Level",
    x = "Education Level",
    y = "Sentence Length"
  ) +
  theme(plot.title = element_text(hjust = 0.5))

#FIVE NUMBER SUMMARY with mean
# Five-number summary plus mean of sentence length, per education level.
filter2019 %>%
  group_by(NEWEDUC) %>%
  summarise(
    min    = fivenum(SENTTCAP)[[1]],
    Q1     = fivenum(SENTTCAP)[[2]],
    median = fivenum(SENTTCAP)[[3]],
    Q3     = fivenum(SENTTCAP)[[4]],
    max    = fivenum(SENTTCAP)[[5]],
    mean   = mean(SENTTCAP)
  )

Boxplot of SENTTCAP and IS924C

# Recode the 924(c) weapon-enhancement indicator as a No/Yes factor.
filter2019$IS924C <- factor(filter2019$IS924C, labels = c("No", "Yes"))

# Boxplot of sentence length by weapon enhancement charge.
ggplot(filter2019, aes(x = IS924C, y = SENTTCAP, fill = IS924C)) +
  geom_boxplot(outlier.size = 2) +
  labs(
    title = "Boxplot of Sentence Length and Weapon Enhancement Charge",
    x = "Weapon Enhancement Charge",
    y = "Sentence Length"
  ) +
  theme(plot.title = element_text(hjust = 0.5))

#FIVE NUMBER SUMMARY with mean
# Five-number summary plus mean of sentence length, per weapon-charge group.
filter2019 %>%
  group_by(IS924C) %>%
  summarise(
    min    = fivenum(SENTTCAP)[[1]],
    Q1     = fivenum(SENTTCAP)[[2]],
    median = fivenum(SENTTCAP)[[3]],
    Q3     = fivenum(SENTTCAP)[[4]],
    max    = fivenum(SENTTCAP)[[5]],
    mean   = mean(SENTTCAP)
  )

Scatterplot of SENTTCAP vs AGE, grouped by COMBDRG2

# Scatter of sentence length against defendant age, colored by drug type code.
# Fix: the title previously read "Sentence Length vs Weight" even though the
# x aesthetic is AGE.
ggplot(filter2019, aes(y=SENTTCAP, x=AGE, color=COMBDRG2)) +
  geom_point() +
  labs(title="Sentence Length vs Age", y="Sentence Length",
       x="Age") + 
      theme(plot.title=element_text(hjust = 0.5)) +
  guides(col=guide_legend("Type of Drug"))

Scatterplot of SENTTCAP vs AGE, Grouped by NEWCNVTN

# Scatter of sentence length against age, colored by plea/trial status.
ggplot(filter2019, aes(x = AGE, y = SENTTCAP, color = NEWCNVTN)) +
  geom_point() +
  labs(
    title = "Sentence Length vs Age, Grouped by Plea/Trial",
    x = "Age",
    y = "Sentence Length"
  ) +
  theme(plot.title = element_text(hjust = 0.5)) +
  guides(col = guide_legend("Plea/Trial"))

Scatterplot of SENTTCAP vs BASEHI, grouped by COMBDRG2

# Scatter of sentence length against BASEHI (base offense level, presumably --
# confirm in codebook), colored by drug type code.
# Fix: the title said "vs Weight", the y label said "BASEHI", and the x label
# said "Age" -- none of which matched the mapped aesthetics
# (y = SENTTCAP, x = BASEHI).
ggplot(filter2019, aes(y=SENTTCAP, x=BASEHI, color=COMBDRG2)) +
  geom_point() +
  labs(title="Sentence Length vs BASEHI", y="Sentence Length",
       x="BASEHI") + 
      theme(plot.title=element_text(hjust = 0.5)) +
  guides(col=guide_legend("Type of Drug"))

## Histogram of Sentences

# Histogram of the response variable (sentence length, months).
ggplot(filter2019, aes(x = SENTTCAP)) +
  geom_histogram(color = "darkblue", fill = "lightblue") +
  labs(title = "Sentence Length Distribution", x = "Sentence Length") +
  theme(plot.title = element_text(hjust = 0.5))

Sentence Length by Gender

# Boxplot of sentence length by sex (MONSEX coded 0/1, labeled Male/Female).
ggplot(filter2019,
       aes(x = factor(MONSEX, levels = c("0", "1"), labels = c("Male", "Female")),
           y = SENTTCAP,
           group = MONSEX)) +
  geom_boxplot() +
  labs(x = "Sex", y = "Sentence Length")

Sentence Length by Citizenship Status

# Interactive boxplot of sentence length by citizenship status (codes 1-5).
citizen_labels <- c("US citizen", "Legal alien", "Illegal alien",
                    "Not US citizen", "Extradited Alien")
p <- ggplot(filter2019,
            aes(x = factor(CITIZEN, levels = c("1", "2", "3", "4", "5"),
                           labels = citizen_labels),
                y = SENTTCAP)) +
  geom_boxplot() +
  labs(x = "Citizenship Status", y = "Sentence Length")

ggplotly(p)

Sentence Length by Race

# Boxplot of sentence length by race (MONRACE codes 1-5 and 9).
race_labels <- c("White", "Black", "American Indian", "Asian/PI",
                 "Multi-racial", "Non-US Native American")
ggplot(filter2019,
       aes(x = factor(MONRACE, levels = c("1", "2", "3", "4", "5", "9"),
                      labels = race_labels),
           y = SENTTCAP)) +
  geom_boxplot() +
  labs(x = "Race", y = "Sentence Length")

Sentence Length by Hispanic or not

# Boxplot of sentence length by Hispanic origin (0 = NA, 1 = Non-Hispanic,
# 2 = Hispanic, per the labels below).
ggplot(filter2019,
       aes(x = factor(HISPORIG, levels = c("0", "1", "2"),
                      labels = c("NA", "Non-Hispanic", "Hispanic")),
           y = SENTTCAP,
           group = HISPORIG)) +
  geom_boxplot() +
  labs(x = "Hispanic or Not", y = "Sentence Length")

Modeling Plan

We will be modeling sentence lengths using linear regression and decision trees. We will use background research and exploratory analysis to find variables that correlate with sentence length, create several iterations of models and trees, and use metrics such as mean squared error, adjusted R-squared, AIC, and BIC to evaluate the different models.

Linear Models

Model 1

# Log-transform drug weight for use as a model predictor.
filter2019$logWGT1 = log(filter2019$WGT1)

#Testing and Training Set
# Fix: seed the RNG so the 80/20 split (and everything fit on it) is
# reproducible across runs; also rename `sample` -> `in_train` so the
# result no longer shadows base::sample.
set.seed(1)
in_train <- sample(c(TRUE, FALSE), nrow(filter2019), replace = TRUE, prob = c(0.8, 0.2))
train  <- filter2019[in_train, ]
test   <- filter2019[!in_train, ]

# Convert drug type to a factor and attach a custom 5x4 contrast matrix.
# NOTE(review): this runs on filter2019 *after* train/test were created, so
# train/test keep the original COMBDRG2 column, and the models below (which
# call as.factor(COMBDRG2) inline on train) use default treatment contrasts --
# this custom matrix never reaches any fitted model. If the custom coding is
# intended, convert before the split; confirm which behavior is wanted.
filter2019$COMBDRG2 = as.factor(filter2019$COMBDRG2)
contrasts(filter2019$COMBDRG2)<-matrix(c(1,0,0,0,0, 0,1,0,0,0, 0,0,1,0,0,  0,0,0,0,1),nrow=5)

#First Model with all relevant variables from previous research and EDA
# OLS regression of SENTTCAP (sentence length in months per the research
# question; the "CAP" suffix suggests a capped/top-coded version -- confirm
# in the codebook) on demographics, offense characteristics, and log drug
# weight, fit on the training split. Categorical codes are wrapped in
# as.factor() inline so lm() builds dummy variables from them.
trial <- lm(SENTTCAP ~ AGE + as.factor(MONSEX) +as.factor(MONRACE) + as.factor(NEWEDUC) + logWGT1 + BASEHI + as.factor(IS924C) + as.factor(NEWCNVTN) + as.factor(CITIZEN) + as.factor(MONACCEP) + as.factor(HISPORIG)+ as.factor(COMBDRG2), data = train)
summary(trial)
## 
## Call:
## lm(formula = SENTTCAP ~ AGE + as.factor(MONSEX) + as.factor(MONRACE) + 
##     as.factor(NEWEDUC) + logWGT1 + BASEHI + as.factor(IS924C) + 
##     as.factor(NEWCNVTN) + as.factor(CITIZEN) + as.factor(MONACCEP) + 
##     as.factor(HISPORIG) + as.factor(COMBDRG2), data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -195.017  -40.609   -5.218   41.927  282.624 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     47.9287    25.3402   1.891 0.059012 .  
## AGE                             -0.2360     0.2984  -0.791 0.429169    
## as.factor(MONSEX)1             -22.6032    12.2198  -1.850 0.064807 .  
## as.factor(MONRACE)2              2.4847     6.9175   0.359 0.719572    
## as.factor(MONRACE)3            -14.7550    22.7142  -0.650 0.516184    
## as.factor(MONRACE)4             11.0435    27.4227   0.403 0.687291    
## as.factor(MONRACE)5             24.8883    62.2702   0.400 0.689521    
## as.factor(MONRACE)7            -59.8815    46.0976  -1.299 0.194398    
## as.factor(MONRACE)8            -65.8381    45.9950  -1.431 0.152790    
## as.factor(NEWEDUC)HS Grad        1.1125     5.6672   0.196 0.844431    
## as.factor(NEWEDUC)Some College  -2.9520     7.5928  -0.389 0.697560    
## as.factor(NEWEDUC)College Grad -27.0848    24.0416  -1.127 0.260334    
## logWGT1                          1.9173     2.5960   0.739 0.460453    
## BASEHI                           3.4311     0.9376   3.659 0.000273 ***
## as.factor(IS924C)Yes            44.2382     7.8833   5.612 2.97e-08 ***
## as.factor(NEWCNVTN)Trial        77.1794    25.6156   3.013 0.002687 ** 
## as.factor(CITIZEN)2            -35.4064    46.5793  -0.760 0.447450    
## as.factor(CITIZEN)3              6.9984    21.8057   0.321 0.748356    
## as.factor(CITIZEN)4            -16.2170    67.9802  -0.239 0.811526    
## as.factor(MONACCEP)-2           15.7886     9.8467   1.603 0.109321    
## as.factor(MONACCEP)0            77.1568    24.9937   3.087 0.002107 ** 
## as.factor(HISPORIG)1            -9.8977    14.6106  -0.677 0.498370    
## as.factor(HISPORIG)2           -24.2593    16.6223  -1.459 0.144925    
## as.factor(COMBDRG2)2             2.4474    11.4143   0.214 0.830288    
## as.factor(COMBDRG2)3             0.7152    10.5679   0.068 0.946066    
## as.factor(COMBDRG2)4           -17.3749    27.9299  -0.622 0.534100    
## as.factor(COMBDRG2)6            17.7573    12.7754   1.390 0.165017    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 61.92 on 652 degrees of freedom
## Multiple R-squared:  0.4004, Adjusted R-squared:  0.3765 
## F-statistic: 16.75 on 26 and 652 DF,  p-value: < 2.2e-16

Model 2 - Backwards regression

#Backwards step regression
# Stepwise AIC selection starting from the full `trial` model; with no scope
# argument the starting formula is the upper bound, so the search can only
# drop (or re-add previously dropped) terms. trace = 0 suppresses the
# per-step log.
trial_step = step(trial, trace = 0)
summary(trial_step)
## 
## Call:
## lm(formula = SENTTCAP ~ as.factor(MONSEX) + BASEHI + as.factor(IS924C) + 
##     as.factor(NEWCNVTN) + as.factor(MONACCEP) + as.factor(HISPORIG), 
##     data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -200.857  -41.474   -5.686   43.040  277.601 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               24.9978    18.5229   1.350 0.177611    
## as.factor(MONSEX)1       -21.8132    11.8399  -1.842 0.065866 .  
## BASEHI                     4.5462     0.4314  10.539  < 2e-16 ***
## as.factor(IS924C)Yes      44.8678     7.6449   5.869 6.89e-09 ***
## as.factor(NEWCNVTN)Trial  85.8001    24.5087   3.501 0.000495 ***
## as.factor(MONACCEP)-2     16.1260     9.4403   1.708 0.088061 .  
## as.factor(MONACCEP)0      65.8036    23.7887   2.766 0.005828 ** 
## as.factor(HISPORIG)1      -5.3649    14.0937  -0.381 0.703578    
## as.factor(HISPORIG)2     -20.0518    15.3632  -1.305 0.192278    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 61.7 on 670 degrees of freedom
## Multiple R-squared:  0.3884, Adjusted R-squared:  0.3811 
## F-statistic: 53.18 on 8 and 670 DF,  p-value: < 2.2e-16

Model 3 - Backwards regression + Essential Variables

#Variables from Backwards Step Regression
# Final candidate model: the backward-step variable set plus drug type
# (COMBDRG2) and log weight, retained on substantive grounds per the
# surrounding text even though step() dropped them.
final_model = lm(formula = SENTTCAP ~ as.factor(MONSEX) + BASEHI + as.factor(IS924C) + 
    as.factor(NEWCNVTN) + as.factor(MONACCEP) + as.factor(HISPORIG) + 
    as.factor(COMBDRG2) + logWGT1, data = train)
summary(final_model)
## 
## Call:
## lm(formula = SENTTCAP ~ as.factor(MONSEX) + BASEHI + as.factor(IS924C) + 
##     as.factor(NEWCNVTN) + as.factor(MONACCEP) + as.factor(HISPORIG) + 
##     as.factor(COMBDRG2) + logWGT1, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -193.62  -41.62   -5.30   42.64  275.62 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               35.51408   20.10125   1.767 0.077727 .  
## as.factor(MONSEX)1       -23.82958   11.89806  -2.003 0.045603 *  
## BASEHI                     3.52117    0.93011   3.786 0.000167 ***
## as.factor(IS924C)Yes      44.35615    7.74228   5.729 1.53e-08 ***
## as.factor(NEWCNVTN)Trial  86.97778   24.63863   3.530 0.000444 ***
## as.factor(MONACCEP)-2     15.53321    9.74883   1.593 0.111559    
## as.factor(MONACCEP)0      67.53275   23.93683   2.821 0.004926 ** 
## as.factor(HISPORIG)1      -5.08015   14.23482  -0.357 0.721294    
## as.factor(HISPORIG)2     -20.56130   15.55303  -1.322 0.186619    
## as.factor(COMBDRG2)2       2.29351   11.24384   0.204 0.838432    
## as.factor(COMBDRG2)3      -0.07739   10.35100  -0.007 0.994037    
## as.factor(COMBDRG2)4     -15.12616   27.34088  -0.553 0.580283    
## as.factor(COMBDRG2)6      16.07370   12.21920   1.315 0.188813    
## logWGT1                    1.55209    2.56648   0.605 0.545547    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 61.68 on 665 degrees of freedom
## Multiple R-squared:  0.3932, Adjusted R-squared:  0.3813 
## F-statistic: 33.15 on 13 and 665 DF,  p-value: < 2.2e-16
#Weight and weapon charges
#Assumption Plots
#Plot graphs individually
library(here)
library(dplyr)
library(ggplot2)
library(ggpubr)
library(ggfortify)
library(MASS)
library(lindia)
library(olsrr)

#Ensuring Model Meets Assumptions
# Diagnostic plots for the chosen linear model via ggfortify::autoplot.
autoplot(final_model,which=1, ncol = 1, label.size = 3) + theme_bw()#Residual vs. Fitted

autoplot(final_model,which=2, ncol = 1, label.size = 3) + theme_bw() #Normal Q-Q plot of residuals

autoplot(final_model, which=4, ncol = 1, label.size = 3) + theme_bw() #Cook's distance (influence)

#All points are below .5 (common Cook's distance concern threshold)

#Multicollinearity Test
library(car)
# car::vif reports generalized VIFs (GVIF) because several terms are factors.
# NOTE(review): the raw GVIF for MONACCEP is ~10.7 in the output below, so
# "All Under 10" overstates it; the size-adjusted GVIF^(1/(2*Df)) values are
# all modest.
vif(final_model) #GVIF^(1/(2*Df)) all modest; MONACCEP raw GVIF slightly over 10
##                          GVIF Df GVIF^(1/(2*Df))
## as.factor(MONSEX)    1.032887  1        1.016310
## BASEHI               7.932103  1        2.816399
## as.factor(IS924C)    1.038758  1        1.019195
## as.factor(NEWCNVTN)  6.005971  1        2.450708
## as.factor(MONACCEP) 10.735437  2        1.810110
## as.factor(HISPORIG)  1.060299  2        1.014746
## as.factor(COMBDRG2)  4.423874  4        1.204274
## logWGT1              5.783147  1        2.404817
#Residual Plot
# Base-graphics residuals-vs-fitted check with a horizontal reference at zero.
model_resid <- resid(final_model)
plot(fitted(final_model), model_resid)
abline(h = 0) #As random as we can get for real data

## Comparing Linear Models Using Metrics

#R-Squared
# Compare adjusted R-squared across the three fitted models (higher is better).
summary(trial)$adj.r.squared
## [1] 0.376496
summary(trial_step)$adj.r.squared #Highest
## [1] 0.3810688
summary(final_model)$adj.r.squared #Very close to trial_step
## [1] 0.3813235
#AIC
# Lower AIC/BIC is better; BIC penalizes additional parameters more heavily.
AIC(trial)
## [1] 7558.348
AIC(trial_step)#Lowest
## [1] 7535.841
AIC(final_model)#Very close to trial_step
## [1] 7540.475
#BIC
BIC(trial)
## [1] 7684.925
BIC(trial_step)#Lowest
## [1] 7581.047
BIC(final_model)#Very close to trial_step
## [1] 7608.285
#Predictions
# Out-of-sample comparison: normalized RMSE (test RMSE divided by the sd of
# the test response), so values near 1 are roughly comparable to predicting
# the mean.
trial_predictions = predict(trial, newdata = test)
trial_step_predictions = predict(trial_step, newdata = test)
final_model_predictions = predict(final_model,newdata = test)

sqrt(mean((trial_predictions - test$SENTTCAP)^2)) / sd(test$SENTTCAP)
## [1] 0.7951826
sqrt(mean((trial_step_predictions- test$SENTTCAP)^2)) / sd(test$SENTTCAP)
## [1] 0.8032512
sqrt(mean((final_model_predictions - test$SENTTCAP)^2)) / sd(test$SENTTCAP)
## [1] 0.7959385

We can see that our backwards step regression has the lowest AIC and BIC as well as the highest adjusted R-squared, outperforming the other models on the metrics we used. However, the metrics are very similar between the backwards-step model and the third model, which adds two variables that are used to calculate sentence length. Therefore we are choosing the third model as our final linear regression model, and we will examine its summary to see which variables are significant.

Decision Trees

Decision Tree 1 - All relevant Variables

library(rpart)
# Regression tree on the same predictor set as the linear models, fit on the
# training split (rpart uses the ANOVA method for a numeric response).
# NOTE(review): NEWCNVTN, NEWEDUC, and IS924C were factored earlier in the
# file (before the split), so the tree splits them categorically ("splits as
# LR" in the output); MONRACE/CITIZEN/MONACCEP remain numeric codes and are
# split as ordered thresholds (e.g. "MONRACE < 1.5") -- confirm intended.
model = rpart(SENTTCAP ~ AGE + MONSEX + MONRACE + NEWEDUC + logWGT1 + BASEHI + IS924C + NEWCNVTN + CITIZEN + MONACCEP + HISPORIG, data = train)
# Base-graphics rendering; use.n = TRUE annotates each node with its count.
plot(model, compress = TRUE)
text(model, cex = 0.7, use.n = TRUE, fancy = FALSE, all = TRUE)

summary(model)
## Call:
## rpart(formula = SENTTCAP ~ AGE + MONSEX + MONRACE + NEWEDUC + 
##     logWGT1 + BASEHI + IS924C + NEWCNVTN + CITIZEN + MONACCEP + 
##     HISPORIG, data = train)
##   n= 679 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.21562933      0 1.0000000 1.0016502 0.07613749
## 2 0.07476488      1 0.7843707 0.7921686 0.04812891
## 3 0.03210995      2 0.7096058 0.7383133 0.04656169
## 4 0.02117246      3 0.6774958 0.7185046 0.04565492
## 5 0.01799232      4 0.6563234 0.6907711 0.04091291
## 6 0.01286577      5 0.6383311 0.7238492 0.04708320
## 7 0.01079762      6 0.6254653 0.7223742 0.04736420
## 8 0.01000000      7 0.6146677 0.7331055 0.04850992
## 
## Variable importance
## NEWCNVTN MONACCEP   BASEHI  logWGT1   IS924C      AGE  MONRACE   MONSEX 
##       33       29       19        8        5        3        2        1 
## 
## Node number 1: 679 observations,    complexity param=0.2156293
##   mean=149.9564, MSE=6140.975 
##   left son=2 (639 obs) right son=3 (40 obs)
##   Primary splits:
##       NEWCNVTN splits as  LR,           improve=0.21562930, (0 missing)
##       MONACCEP < -1       to the left,  improve=0.20022640, (0 missing)
##       BASEHI   < 23       to the left,  improve=0.08837669, (0 missing)
##       logWGT1  < 2.915161 to the left,  improve=0.05904638, (0 missing)
##       IS924C   splits as  LR,           improve=0.04446148, (0 missing)
##   Surrogate splits:
##       MONACCEP < -1       to the left,  agree=0.99, adj=0.825, (0 split)
## 
## Node number 2: 639 observations,    complexity param=0.07476488
##   mean=140.852, MSE=4469.77 
##   left son=4 (361 obs) right son=5 (278 obs)
##   Primary splits:
##       BASEHI   < 29       to the left,  improve=0.10914870, (0 missing)
##       logWGT1  < 2.915161 to the left,  improve=0.06693642, (0 missing)
##       MONACCEP < -2.5     to the right, improve=0.04564653, (0 missing)
##       IS924C   splits as  LR,           improve=0.02959730, (0 missing)
##       MONRACE  < 1.5      to the right, improve=0.01019195, (0 missing)
##   Surrogate splits:
##       logWGT1 < 5.490483 to the left,  agree=0.750, adj=0.424, (0 split)
##       MONRACE < 1.5      to the right, agree=0.645, adj=0.183, (0 split)
##       MONSEX  < 0.5      to the left,  agree=0.588, adj=0.054, (0 split)
##       AGE     < 59.5     to the left,  agree=0.570, adj=0.011, (0 split)
##       CITIZEN < 1.5      to the left,  agree=0.567, adj=0.004, (0 split)
## 
## Node number 3: 40 observations,    complexity param=0.01079762
##   mean=295.3995, MSE=10360.62 
##   left son=6 (32 obs) right son=7 (8 obs)
##   Primary splits:
##       BASEHI  < 31       to the left,  improve=0.10864000, (0 missing)
##       IS924C  splits as  LR,           improve=0.10747810, (0 missing)
##       AGE     < 31.5     to the left,  improve=0.07504421, (0 missing)
##       logWGT1 < 3.598495 to the left,  improve=0.05090463, (0 missing)
##       NEWEDUC splits as  LRL-,         improve=0.04295453, (0 missing)
##   Surrogate splits:
##       logWGT1 < 7.047279 to the left,  agree=0.825, adj=0.125, (0 split)
## 
## Node number 4: 361 observations,    complexity param=0.03210995
##   mean=121.469, MSE=3680.832 
##   left son=8 (321 obs) right son=9 (40 obs)
##   Primary splits:
##       IS924C   splits as  LR,           improve=0.10076130, (0 missing)
##       BASEHI   < 17       to the left,  improve=0.06494533, (0 missing)
##       logWGT1  < 1.641826 to the left,  improve=0.06012477, (0 missing)
##       MONACCEP < -2.5     to the right, improve=0.03883086, (0 missing)
##       AGE      < 48.5     to the right, improve=0.03626606, (0 missing)
##   Surrogate splits:
##       AGE < 22.5     to the right, agree=0.895, adj=0.05, (0 split)
## 
## Node number 5: 278 observations,    complexity param=0.02117246
##   mean=166.022, MSE=4372.857 
##   left son=10 (196 obs) right son=11 (82 obs)
##   Primary splits:
##       BASEHI   < 33       to the left,  improve=0.072622030, (0 missing)
##       logWGT1  < 5.720952 to the left,  improve=0.038467160, (0 missing)
##       AGE      < 29.5     to the right, improve=0.010684270, (0 missing)
##       HISPORIG < 0.5      to the right, improve=0.006375651, (0 missing)
##       MONSEX   < 0.5      to the right, improve=0.006002461, (0 missing)
##   Surrogate splits:
##       logWGT1 < 6.929611 to the left,  agree=0.781, adj=0.256, (0 split)
##       AGE     < 64.5     to the left,  agree=0.716, adj=0.037, (0 split)
## 
## Node number 6: 32 observations
##   mean=278.6247, MSE=10139.18 
## 
## Node number 7: 8 observations
##   mean=362.4987, MSE=5618.481 
## 
## Node number 8: 321 observations,    complexity param=0.01799232
##   mean=114.6708, MSE=3025.351 
##   left son=16 (125 obs) right son=17 (196 obs)
##   Primary splits:
##       BASEHI   < 20.5     to the left,  improve=0.07725268, (0 missing)
##       logWGT1  < 2.73354  to the left,  improve=0.05644127, (0 missing)
##       AGE      < 53.5     to the right, improve=0.03951030, (0 missing)
##       MONACCEP < -2.5     to the right, improve=0.03723461, (0 missing)
##       HISPORIG < 1.5      to the right, improve=0.02027234, (0 missing)
##   Surrogate splits:
##       logWGT1  < 2.779371 to the left,  agree=0.866, adj=0.656, (0 split)
##       MONACCEP < -2.5     to the right, agree=0.801, adj=0.488, (0 split)
##       AGE      < 29.5     to the left,  agree=0.639, adj=0.072, (0 split)
##       NEWEDUC  splits as  LRRR,         agree=0.623, adj=0.032, (0 split)
## 
## Node number 9: 40 observations
##   mean=176.025, MSE=5593.824 
## 
## Node number 10: 196 observations
##   mean=154.4956, MSE=3507.451 
## 
## Node number 11: 82 observations,    complexity param=0.01286577
##   mean=193.573, MSE=5364.763 
##   left son=22 (65 obs) right son=23 (17 obs)
##   Primary splits:
##       AGE     < 43.5     to the left,  improve=0.121949100, (0 missing)
##       logWGT1 < 8.628586 to the left,  improve=0.047653750, (0 missing)
##       NEWEDUC splits as  LLR-,         improve=0.012566850, (0 missing)
##       MONRACE < 1.5      to the right, improve=0.007142106, (0 missing)
##       BASEHI  < 37       to the left,  improve=0.007119122, (0 missing)
##   Surrogate splits:
##       MONRACE < 3        to the left,  agree=0.817, adj=0.118, (0 split)
##       CITIZEN < 1.5      to the left,  agree=0.805, adj=0.059, (0 split)
## 
## Node number 16: 125 observations
##   mean=95.52744, MSE=2477.31 
## 
## Node number 17: 196 observations
##   mean=126.8795, MSE=2992.096 
## 
## Node number 22: 65 observations
##   mean=180.4923, MSE=4090.742 
## 
## Node number 23: 17 observations
##   mean=243.5876, MSE=7080.329
library(tree)
set.seed(2)
# Second regression tree, fit with the `tree` package so it can be
# cross-validated and pruned with cv.tree()/prune.tree() below.
# Fix: removed the stray interactive help call `?tree`, which has no place
# in an analysis script.
# NOTE(review): the object name `tree` shadows the tree() function; R still
# resolves `tree(...)` to the function, and later blocks reference this
# object by name, so the name is kept.
tree <- tree(SENTTCAP ~ AGE + MONSEX + MONRACE + NEWEDUC + logWGT1 + BASEHI + IS924C + NEWCNVTN + CITIZEN + MONACCEP + HISPORIG, data = train)
plot(tree)
text(tree, cex=0.7)

Pruning Tree

# Cost-complexity cross-validation of the tree; $dev in the printed result
# is cross-validated deviance for each candidate tree size.
cv.filter2019 = cv.tree(tree, FUN = prune.tree)
cv.filter2019
## $size
## [1] 9 7 6 5 4 3 2 1
## 
## $dev
## [1] 2987906 2984479 2984300 2896480 2922575 3173075 3323431 4258765
## 
## $k
## [1]      -Inf  51988.23  53646.70  75022.99  88283.27 133889.56 311748.76
## [8] 899114.37
## 
## $method
## [1] "deviance"
## 
## attr(,"class")
## [1] "prune"         "tree.sequence"
# Deviance vs tree size from the CV run, to locate the best size visually.
plot(cv.filter2019$size, cv.filter2019$dev, type='b')
title(main = "Deviance VS Tree Size")

# Prune to 5 terminal nodes -- the size with the minimum CV deviance
# (2896480 at size 5 in the output above).
prune.filter2019 = prune.tree(tree, best = 5)
plot(prune.filter2019)
text(prune.filter2019, cex=0.6)

Make predictions and NRMSE

#Predicted variables for Tree
# Out-of-sample predictions from the unpruned tree, plotted against observed
# test sentences, followed by normalized RMSE (RMSE / sd of test response)
# for comparison with the linear models.
# Fix: removed the stray interactive help call `?predict`.
yhat_tree = predict(tree, newdata = test)
plot(yhat_tree, test$SENTTCAP)

sqrt(mean((yhat_tree - test$SENTTCAP)^2)) / sd(test$SENTTCAP)
## [1] 0.7921585
#Predicted Values for Pruned Down Tree
yhat = predict(prune.filter2019, newdata = test)
plot(yhat, test$SENTTCAP)

# Normalized RMSE for the pruned tree (slightly above the unpruned tree's).
sqrt(mean((yhat - test$SENTTCAP)^2)) / sd(test$SENTTCAP)
## [1] 0.7927329